In [17]:
import datetime
import pandas as pd
import numpy as np
import scipy.stats as ss
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.animation as animation
from IPython.display import Image, display, HTML
import folium 
%matplotlib inline
import warnings 
import pmdarima as pm
import statsmodels.api as sm
import statsmodels.tsa.seasonal as sts
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.stats.diagnostic import acorr_ljungbox
import statsmodels.graphics.tsaplots as splt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX

Jordan Lake, North Carolina¶

In [2]:
lat = 35.7809955
lon = -79.017105
targ_lat = [35.676263, 35.885728]
targ_lon = [-79.07959, -78.95462]
m = folium.Map(location=[lat, lon], zoom_start=11)
folium.Marker(location=[lat, lon], popup="point of interest").add_to(m)
display(m)
Make this Notebook Trusted to load map: File -> Trust Notebook

Read data already downloaded from CYAN project from parquet file¶

In [3]:
%%time
URL = './data/L3B/'
cols = ['clat','clon','north','south','west','east','date','CI_cyano']

f = 'L3B_CYAN_DAILY_JORDAN.parquet'
df = pd.read_parquet(URL + f, columns=cols)

df.shape
CPU times: user 78.9 ms, sys: 69.4 ms, total: 148 ms
Wall time: 152 ms
Out[3]:
(332990, 8)
In [46]:
# quick check on the data
df_loc = df.groupby(['clat','clon']).CI_cyano.count().reset_index()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,4))
scat = ax1.scatter(df_loc.clon, df_loc.clat, c=df_loc.CI_cyano, cmap='Spectral_r', s=10)
cbar = fig.colorbar(scat, ax=ax1)
ax1.set_title("Observation count by location", fontsize=12)
ax1.grid(linewidth=0.3, linestyle='-' )

ax2.hist(df.CI_cyano)
ax2.set_title('Histogram of CI_cyano')
ax2.grid(linewidth=0.25)
plt.tight_layout();
No description has been provided for this image

Prepare data for analysis¶

In [4]:
# breakdown date column to year, month, day
df['date'] = pd.to_datetime(df['date'])
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day

# categorize CI_cyano values to high, medium, low
hab_high = 0.016
hab_med = 0.001

df['HAB'] = np.where(df.CI_cyano.values >= hab_high,'high',
                         np.where(df.CI_cyano.values >= hab_med, 'med', 'low'))
df['HAB_HIGH'] = (df.HAB == 'high')*1
df['HAB_MEDIUM'] = (df.HAB == 'med')*1
df['HAB_HIGH_MED'] = df['HAB_HIGH'] + df['HAB_MEDIUM']
In [5]:
# aggreate data to daily averages
df_daily = df.groupby('date').agg(CI_cyano=('CI_cyano','mean'),
                                      nobs=('clat','count'),
                                      hab_high=('HAB_HIGH','sum'),
                                      hab_medium=('HAB_MEDIUM','sum'),
                                      hab_high_med=('HAB_HIGH_MED','sum')).reset_index()

df_daily['month'] = df_daily.date.dt.month
df_daily['week'] = df_daily.date.dt.isocalendar().week
df_daily['year'] = df_daily.date.dt.isocalendar().year

# aggreate data to weekly averages
df_weekly = df_daily.groupby(['year','week']).agg(date=('date','min'), CI_cyano=('CI_cyano','mean')).reset_index()
df_weekly = df_weekly.sort_values(by='date')

def plot_dt(x, y, title, hab_high = 0.016, hab_med = 0.001):
    plt.figure(figsize=(15, 3))
    plt.plot(x, y)
    if y.max() >= hab_med:
        plt.hlines(y=hab_med, xmin=x.min(), xmax=x.max(), color='orange', linestyles='dotted', label='HAB Medium')
    if y.max() >= hab_high:
        plt.hlines(y=hab_high, xmin=x.min(), xmax=x.max(), color='red', linestyles='dotted', label='HAB High')
    plt.title(title)
    plt.grid(linewidth=0.25);

# plot daily and weekly average trends  
plot_dt(x=df_daily.date, y=df_daily.CI_cyano, title="Average Daily CI_cyano")
plot_dt(x=df_weekly.date, y=df_weekly.CI_cyano, title="Average Weekly CI_cyano")
No description has been provided for this image
No description has been provided for this image
In [6]:
print('Last 4 week average CI level:')
df_weekly['HAB Level'] = np.where(df_weekly.CI_cyano.values >= hab_high,'High',
                         np.where(df_weekly.CI_cyano.values >= hab_med, 'Medium', 'Low'))
df_weekly[-4:]
Last 4 week average CI level:
Out[6]:
year week date CI_cyano HAB Level
407 2024 10 2024-03-07 0.00005 Low
408 2024 11 2024-03-11 0.00005 Low
409 2024 12 2024-03-19 0.00005 Low
410 2024 13 2024-03-25 0.00005 Low

Spatial Analysis: HAB by Location¶

In [7]:
# aggreate data by lat/lon locations
df_loc = df.groupby(['clat','clon']).agg(CI_cyano=('CI_cyano','mean'),
                                             CI_cyano_median=('CI_cyano','median'),
                                             nobs=('date','count'),
                                             hab_high=('HAB_HIGH','sum'),
                                             hab_high_med=('HAB_HIGH_MED','sum')).reset_index()
df_loc['pct_hab_high'] = df_loc.hab_high/df_loc.nobs
df_loc['pct_hab_high_med'] = df_loc.hab_high_med/df_loc.nobs


fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 6))

for v, ax, t in zip(['CI_cyano','pct_hab_high_med','pct_hab_high'],
                      [ax1, ax2, ax3],
                      ['Average CI_cyano Level', '% days with MEDIUM or above\nHAB level (CI >= 0.001)',
                      '% days with HIGH or above\nHAB level (CI >= 0.016)']):
    
    scat = ax.scatter(df_loc.clon, df_loc.clat, c=df_loc[v], cmap='Spectral_r', s=40, alpha=0.8)
    cbar = fig.colorbar(scat, ax=ax)
    ax.set_title(t, fontsize=15)
    ax.grid(linewidth=0.3, linestyle='-' )

plt.suptitle('Average CI_cyano Levels by Location', fontsize=20)
plt.tight_layout();
No description has been provided for this image

Temporal Analysis: HAB by Time (Year & Month)¶

In [8]:
# plot data by month to reveal seasonality
df_daily['pct_hab_high'] = df_daily.hab_high/df_daily.nobs
df_daily['pct_hab_high_med'] = df_daily.hab_high_med/df_daily.nobs

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(9, 10))
sns.set_theme(style='white')
sns.set(font_scale=0.8)
for v, ax, f, t in zip(['CI_cyano','pct_hab_high_med','pct_hab_high'],
                      [ax1, ax2, ax3],
                      [".4f", ".1%", ".1%"],
                      ['Average Monthly CI_cyano', '% area with MEDIUM or above HAB level (CI >= 0.001)',
                      '% area with HIGH or above HAB level (CI >= 0.016)']):
    
    pivot_ci = df_daily.pivot_table(index='year', columns='month', values=v, aggfunc='mean', margins=True)
    sns.heatmap(pivot_ci, ax=ax, annot=True, cmap='Spectral_r', fmt=f, linewidths=0.5) 
    ax.set_title(t, fontsize=12)
    ax.set_xlabel('Month', fontsize=10)
    ax.set_ylabel('Year', fontsize=10)
    
plt.suptitle('Monthly CI_cyano Levels Over the years', fontsize=16)
plt.tight_layout();
No description has been provided for this image

Monthly CI Cyano change over time¶

In [9]:
data = df.groupby(['clat','clon','Year','Month']).CI_cyano.mean().reset_index()
data['frame'] = (data.Year.astype('str')+'-'+data.Month.astype('str').apply(lambda x: x.zfill(2)))

data2 = df.groupby(['Year','Month']).agg(nobs = ('CI_cyano','count'), CI_cyano=('CI_cyano','mean'), CI_cyano_max=('CI_cyano','max'), HAB_HIGH_MED=('HAB_HIGH_MED','sum') ).reset_index()
data2['PCT_HAB_HIGH_MED'] = data2.HAB_HIGH_MED/data2.nobs
data2['day'] = 1
data2['date'] = pd.to_datetime(data2[['Year', 'Month', 'day']])
data2 = data2.set_index('date')[['CI_cyano','CI_cyano_max','PCT_HAB_HIGH_MED']]

yrange1 = [data.clat.min()-0.01, data.clat.max()+0.01]
xrange1 = [data.clon.min()-0.01, data.clon.max()+0.01]
xrange2 = [data2.index.values.min(), data2.index.values.max()]
yrange2_1 = [0, data2.CI_cyano.max()*1.05]
yrange2_2 = [0, data2.CI_cyano_max.max()*1.05]
yrange2_3 = [0, data2.PCT_HAB_HIGH_MED.max()*1.05]

frames = data.frame.sort_values().unique()

fig_w = 4
fig_h = int(round((yrange1[1]-yrange1[0])/(xrange1[1]-xrange1[0])*fig_w,0))

fig = plt.figure(layout='constrained', figsize=(fig_w*3, fig_h))
subfigs = fig.subfigures(1, 2, wspace=0.07, width_ratios=[1, 1])
sns.set_theme(style='white')
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)

vmin = 0
vmax = 0.02

# Create the initial scatter plot with the specified colormap and color range
axsLeft = subfigs[0].subplots(1,1, sharey=True)
axsLeft.set_xlim(xrange1)
axsLeft.set_ylim(yrange1)
scat = axsLeft.scatter(data.clon, data.clat, c=data.CI_cyano, cmap='Spectral_r', s=30, alpha=0.8, vmin=vmin, vmax=vmax)
cbar = fig.colorbar(scat, ax=axsLeft)

# Create an initial empty line plot
axsRight = subfigs[1].subplots(3, 1, sharex=True)

line1, = axsRight[0].plot([], [], lw=1, color = 'blue', label='Avg CI cyano')
if yrange2_1[1] >= hab_med:
    axsRight[0].hlines(y=hab_med, xmin=data2.index[0], xmax=data2.index[-1], color='orange', linestyles='dotted', label='HAB Medium')
if yrange2_1[1] >= hab_high:
    axsRight[0].hlines(y=hab_high, xmin=data2.index[0], xmax=data2.index[-1], color='red', linestyles='dotted', label='HAB High')
axsRight[0].legend()

line2, = axsRight[1].plot([], [], lw=1, color = 'red', label='Max CI cyano')
if data2.CI_cyano_max.max() >= hab_med:
    axsRight[1].hlines(y=hab_med, xmin=data2.index[0], xmax=data2.index[-1], color='orange', linestyles='dotted', label='HAB Medium')
if data2.CI_cyano_max.max() >= hab_high:
    axsRight[1].hlines(y=hab_high, xmin=data2.index[0], xmax=data2.index[-1], color='red', linestyles='dotted', label='HAB High')
axsRight[1].legend()

line3, = axsRight[2].plot([], [], lw=1, color = 'red')

#ax2.set_xticks(np.arange(1,len(frames)+1))
axsRight[2].set_xlim(xrange2)
axsRight[0].set_ylim(yrange2_1)
axsRight[1].set_ylim(yrange2_2)
axsRight[2].set_ylim(yrange2_3)

def animate(i):
    axsLeft.clear()
    axsLeft.set_xlim(xrange1)
    axsLeft.set_ylim(yrange1)
    
    # Update the scatter plot with the same colormap and color range
    scat = axsLeft.scatter(data[data.frame == frames[i]].clon,
                       data[data.frame == frames[i]].clat,
                       c=data[data.frame == frames[i]].CI_cyano,
                       cmap='Spectral_r', vmin=vmin, vmax=vmax,
                       s=30, alpha=0.8)

    axsLeft.set_title(f'Month: {frames[i]}', fontsize=12)
    axsLeft.grid(linewidth=0.3, linestyle='-' )

    # Update the line plot with the line data for the current frame
    line1.set_data(data2.index.values[:i+1], data2.CI_cyano.values[:i+1])
    line2.set_data(data2.index.values[:i+1], data2.CI_cyano_max.values[:i+1])
    line3.set_data(data2.index.values[:i+1], data2.PCT_HAB_HIGH_MED.values[:i+1])

    axsRight[0].set_title(f'Avg CI cyano: {data2.CI_cyano.values[i]:.3f}', fontsize=12)
    axsRight[1].set_title(f'Max CI cyano: {data2.CI_cyano_max.values[i]:.3f}', fontsize=12)
    axsRight[2].set_title(f'% area with Medium/High HAB levels: {data2.PCT_HAB_HIGH_MED.values[i]:.1%}', fontsize=12)
    axsRight[0].grid(linewidth=0.3, linestyle='-' )
    axsRight[1].grid(linewidth=0.3, linestyle='-' )
    axsRight[2].grid(linewidth=0.3, linestyle='-' )

    return scat, line1, line2, line3

ani = animation.FuncAnimation(fig, animate, repeat=False, frames=len(frames), interval=1000)

#To save the animation using Pillow as a gif
writer = animation.PillowWriter(fps=1,
                                metadata=dict(artist='Me'),
                                bitrate=1000)

ani.save('scatter.gif', writer=writer)

with open('scatter.gif', 'rb') as f:
    display(Image(data=f.read(), format='png'))
No description has been provided for this image
No description has been provided for this image
In [10]:
# show animated plot as video
HTML(ani.to_html5_video())
Out[10]:
Your browser does not support the video tag.

Impute missing weeks with mean¶

In [11]:
## impute missing week data
week_first = df_weekly[df_weekly.date==df_weekly.date.min()].week.values[0]
week_last = df_weekly[df_weekly.date==df_weekly.date.max()].week.values[0]
years = df_weekly.year.unique()

df_weekly_imp = df_weekly.copy()
append_rows = {'year':[], 'week':[], 'date':[]}

cicyano_mean = df_weekly.groupby('week').CI_cyano.mean().reset_index()

for w in range(1,53):
    if w < week_first:
        yy = years[1:]
    elif w > week_last:
        yy = years[:-1]
    else:
        yy = years
    tmpdf = df_weekly_imp[df_weekly_imp.week==w]
    y_avg = tmpdf.CI_cyano.mean()
    for y in yy:
        tmpdf2 = tmpdf[tmpdf.year==y]
        if len(tmpdf2)==0:
            append_rows['year'].append(y)
            append_rows['week'].append(w)
            append_rows['date'].append(datetime.datetime.strptime(str(y)+'-W'+str(w)+'-1', '%G-W%V-%u'))
            
df_append = pd.DataFrame(append_rows)
df_append['week'] = df_append['week'].astype('UInt32')
df_append = pd.merge_asof(left=df_append, right=cicyano_mean, on='week', direction='nearest')
df_weekly_imp = pd.concat([df_weekly_imp, df_append], axis=0, ignore_index=True)
df_weekly_imp = df_weekly_imp.sort_values(by='date').reset_index(drop=True)

sns.set_theme(style='white')
plt.figure(figsize=(10, 1.5))
pivot = pd.pivot_table(df_weekly, values='CI_cyano', index='year', columns='week', aggfunc='mean', margins=True)
sns.heatmap(pivot, annot=False, cmap='Spectral_r', linewidths=0.5)
plt.title('Weekly CI_cyano - Before imputation', fontsize=15)
plt.show()

plt.figure(figsize=(10, 1.5))
pivot = pd.pivot_table(df_weekly_imp,values='CI_cyano',index='year',columns='week',aggfunc='mean',margins=True)
sns.heatmap(pivot, annot=False, cmap='Spectral_r', linewidths=0.5)
plt.title('Weekly CI_cyano - After imputation', fontsize=15)
plt.show()
No description has been provided for this image
No description has been provided for this image

Prepare data for SARIMA model¶

In [12]:
# log-transform CI_cyano values
fcdat = df_weekly_imp.copy()
fcdat['log_y'] = np.log(fcdat.CI_cyano)

plt.figure(figsize=(15, 3))
plt.plot(fcdat.date, fcdat.log_y)
plt.title('plot of log transformed CI_cyano')
plt.grid(linewidth=0.25);
No description has been provided for this image
In [13]:
# split data into train/test, use 10% data as test
test_size = int(len(fcdat) * 0.1)
train_test = ['train']*(len(fcdat)-test_size)+['test']*test_size
fcdat['train_test'] = train_test

dtrain = fcdat[fcdat['train_test']=='train'].copy()
dtest = fcdat[fcdat['train_test']=='test'].copy()
fcdat.shape, dtrain.shape, dtest.shape
Out[13]:
((417, 7), (376, 7), (41, 7))
In [14]:
# quick check data distribution
def dist_ts(ts, lab = '', bins = 40):
    _,ax = plt.subplots(1,2,figsize=(10,4))
    ## Plot the histogram with labels
    ts.hist(ax = ax[0], bins = bins, alpha = 0.5);
    ax[0].set_xlabel('Value');
    ax[0].set_ylabel('Frequency');
    ax[0].set_title('Histogram of ' + lab);
    ## Plot the q-q plot on the other axes
    ss.probplot(ts, plot = ax[1]);

dist_ts(fcdat.log_y, lab = '\n log transformed values')
No description has been provided for this image

Stationarity Test - ADF/KPSS¶

In [15]:
def test_stationarity(series, regression_model='c', rounding=3):
    """Function to test the stationarity of time series using:
    1. Augmented Dicky Fuller test using AIC 
    2. KPSS test for constant
    
    Arguments:  
        Series: the time series to be evaluated.   
        regression_model: model to be used:   
            'c' - model with a constant   
            'ct' - model wiht a constant and trend  
        rounding: the number of digits to display in printed output.   
    """
    if regression_model not in ['c','ct']:    
        print('Error!: argument regression model must be one of {\'c\',\'ct\'}')
        return
    with warnings.catch_warnings(): 
        warnings.filterwarnings("ignore")
        kpss_test = kpss(series, regression=regression_model)
        ADF_test = adfuller(series, regression=regression_model, autolag="AIC")
    return [round(ADF_test[0],rounding), round(ADF_test[4]['5%'],rounding), round(ADF_test[1],rounding),
                                       round(kpss_test[0],rounding), round(kpss_test[1],rounding)]
def stationary_tests(series, names, regression_model='c', rounding=3):
    test_columns = ['ADF_statistic','ADF_conf_int','ADF_p_value','KPSS_statistic','KPSS_p_value']
    out = pd.DataFrame(index=names, columns=test_columns)    
    for ser, name in zip(series,names):  
        out.loc[name,:] = test_stationarity(ser, regression_model=regression_model, rounding=rounding)
    return out

def auto_partial_corr_plot(ts, lags=40):
    _,ax = plt.subplots(2,1, figsize=(8,4))
    _=splt.plot_acf(ts, lags=lags, ax=ax[0])
    ax[0].grid(linewidth=0.5)
    _=splt.plot_pacf(ts, lags=lags, method='yw', ax=ax[1])
    ax[1].grid(linewidth=0.5)
    plt.tight_layout();
In [18]:
sns.set_theme(style='white')
display(stationary_tests([fcdat['log_y']], ['log transformed']))
print("ACF & PACF plots:")
auto_partial_corr_plot(fcdat['log_y'], lags=110)
ADF_statistic ADF_conf_int ADF_p_value KPSS_statistic KPSS_p_value
log transformed -4.857 -2.869 0.0 0.08 0.1
ACF & PACF plots:
No description has been provided for this image

Seasonal Differencing¶

In [19]:
seasonal_diff = fcdat['log_y'].diff(52)
plt.figure(figsize=(15, 3))
plt.plot(fcdat.date, seasonal_diff)
plt.title('Seasonal difference of log of CI_cyano')
plt.grid(linewidth=0.5);

display(stationary_tests([seasonal_diff[52:]], ['Seasonal diff of log y']))
print("ACF & PACF plots:")
auto_partial_corr_plot(seasonal_diff[52:], lags=110)
ADF_statistic ADF_conf_int ADF_p_value KPSS_statistic KPSS_p_value
Seasonal diff of log y -3.748 -2.87 0.003 0.104 0.1
ACF & PACF plots:
No description has been provided for this image
No description has been provided for this image

Seasonality Decomposition¶

In [20]:
def decomp_ts(ts, period=52, model = 'additive'):
    res = sts.seasonal_decompose(ts, period = period, model = model) 
    res.plot()
    return(pd.DataFrame({'resid': res.resid, 
                         'trend': res.trend, 
                         'seasonal': res.seasonal},
                       index = ts.index) )

decomp = decomp_ts(fcdat.set_index('date').CI_cyano) 
No description has been provided for this image
In [21]:
def evaluate_resids(residual_series, title='residual series', regression_model='c', ljungbox=True, ADF=True):
    dist_ts(residual_series, title) 
    auto_partial_corr_plot(residual_series)
    if(ljungbox==True): print(acorr_ljungbox(residual_series, lags=40, model_df=0))
    if(ADF==True): print(stationary_tests([residual_series], [title], regression_model=regression_model))
        
evaluate_resids(decomp[26:-26].resid)   
       lb_stat     lb_pvalue
1   118.825162  1.143797e-27
2   188.564641  1.131640e-41
3   234.450733  1.508262e-50
4   258.799228  8.274428e-55
5   266.920148  1.283184e-55
6   270.028860  2.138659e-55
7   275.909504  8.368777e-56
8   278.242439  1.745216e-55
9   279.124423  6.932572e-55
10  279.138631  3.956402e-54
11  279.139435  2.150172e-53
12  280.548786  5.635882e-53
13  285.456063  2.639530e-53
14  292.811713  3.722250e-54
15  303.183527  1.218128e-55
16  316.763958  8.522971e-58
17  332.766349  1.869545e-60
18  344.673552  2.891402e-62
19  358.665573  1.648992e-64
20  375.082848  2.958989e-67
21  386.576666  5.524940e-69
22  400.731163  2.903474e-71
23  418.252054  3.081321e-74
24  442.995908  1.058247e-78
25  464.102776  2.047998e-82
26  478.248145  1.084270e-84
27  485.939436  1.226584e-85
28  491.964592  3.035246e-86
29  499.010688  4.595895e-87
30  501.792401  5.182056e-87
31  505.475207  3.774684e-87
32  506.445654  9.753768e-87
33  506.678643  3.512810e-86
34  507.397921  9.921796e-86
35  507.880045  3.088258e-85
36  508.336471  9.595459e-85
37  508.571105  3.261822e-84
38  508.593389  1.207494e-83
39  509.764150  2.588925e-83
40  511.000996  5.324831e-83
                ADF_statistic ADF_conf_int ADF_p_value KPSS_statistic  \
residual series        -6.448        -2.87         0.0          0.026   

                KPSS_p_value  
residual series          0.1  
No description has been provided for this image
No description has been provided for this image

Train SARIMA Model¶

In [22]:
# tune the hyperparameters of the SARIMA model
best_model = pm.auto_arima(dtrain.log_y, 
                             max_p=3, max_q=3, 
                             max_P=3, max_Q=3, 
                             m=52, seasonal=True,
                             d=0, D=1, trace=True,
                             information_criterion = 'bic',
                             error_action='ignore',  # don't want to know if an order does not work
                             suppress_warnings=True,  # don't want convergence warnings
                             stepwise=True)  # set to stepwise

a_order = best_model.get_params()['order']
s_order = best_model.get_params()['seasonal_order']
c_trend=None
if best_model.get_params()['with_intercept']:
    c_trend = 'c'

model_name = 'SARIMA'+str(a_order)+'x'+str(s_order)
Performing stepwise search to minimize bic
 ARIMA(2,0,2)(1,1,1)[52] intercept   : BIC=inf, Time=27.11 sec
 ARIMA(0,0,0)(0,1,0)[52] intercept   : BIC=1256.984, Time=0.29 sec
 ARIMA(1,0,0)(1,1,0)[52] intercept   : BIC=934.036, Time=5.55 sec
 ARIMA(0,0,1)(0,1,1)[52] intercept   : BIC=inf, Time=10.13 sec
 ARIMA(0,0,0)(0,1,0)[52]             : BIC=1252.288, Time=0.24 sec
 ARIMA(1,0,0)(0,1,0)[52] intercept   : BIC=999.238, Time=0.90 sec
 ARIMA(1,0,0)(2,1,0)[52] intercept   : BIC=913.663, Time=17.64 sec
 ARIMA(1,0,0)(3,1,0)[52] intercept   : BIC=917.255, Time=39.84 sec
 ARIMA(1,0,0)(2,1,1)[52] intercept   : BIC=inf, Time=50.02 sec
 ARIMA(1,0,0)(1,1,1)[52] intercept   : BIC=inf, Time=10.07 sec
 ARIMA(1,0,0)(3,1,1)[52] intercept   : BIC=inf, Time=205.68 sec
 ARIMA(0,0,0)(2,1,0)[52] intercept   : BIC=1147.548, Time=12.16 sec
 ARIMA(2,0,0)(2,1,0)[52] intercept   : BIC=911.840, Time=22.51 sec
 ARIMA(2,0,0)(1,1,0)[52] intercept   : BIC=932.042, Time=7.27 sec
 ARIMA(2,0,0)(3,1,0)[52] intercept   : BIC=915.510, Time=53.22 sec
 ARIMA(2,0,0)(2,1,1)[52] intercept   : BIC=inf, Time=46.54 sec
 ARIMA(2,0,0)(1,1,1)[52] intercept   : BIC=inf, Time=11.44 sec
 ARIMA(2,0,0)(3,1,1)[52] intercept   : BIC=inf, Time=233.64 sec
 ARIMA(3,0,0)(2,1,0)[52] intercept   : BIC=915.143, Time=26.97 sec
 ARIMA(2,0,1)(2,1,0)[52] intercept   : BIC=915.365, Time=44.50 sec
 ARIMA(1,0,1)(2,1,0)[52] intercept   : BIC=910.048, Time=25.02 sec
 ARIMA(1,0,1)(1,1,0)[52] intercept   : BIC=929.582, Time=9.66 sec
 ARIMA(1,0,1)(3,1,0)[52] intercept   : BIC=913.922, Time=67.20 sec
 ARIMA(1,0,1)(2,1,1)[52] intercept   : BIC=inf, Time=65.20 sec
 ARIMA(1,0,1)(1,1,1)[52] intercept   : BIC=inf, Time=13.03 sec
 ARIMA(1,0,1)(3,1,1)[52] intercept   : BIC=inf, Time=200.06 sec
 ARIMA(0,0,1)(2,1,0)[52] intercept   : BIC=1009.801, Time=16.61 sec
 ARIMA(1,0,2)(2,1,0)[52] intercept   : BIC=915.226, Time=30.12 sec
 ARIMA(0,0,2)(2,1,0)[52] intercept   : BIC=965.915, Time=23.30 sec
 ARIMA(2,0,2)(2,1,0)[52] intercept   : BIC=914.605, Time=99.79 sec
 ARIMA(1,0,1)(2,1,0)[52]             : BIC=904.762, Time=12.65 sec
 ARIMA(1,0,1)(1,1,0)[52]             : BIC=923.875, Time=4.29 sec
 ARIMA(1,0,1)(3,1,0)[52]             : BIC=908.806, Time=37.04 sec
 ARIMA(1,0,1)(2,1,1)[52]             : BIC=inf, Time=42.45 sec
 ARIMA(1,0,1)(1,1,1)[52]             : BIC=inf, Time=13.06 sec
 ARIMA(1,0,1)(3,1,1)[52]             : BIC=inf, Time=179.73 sec
 ARIMA(0,0,1)(2,1,0)[52]             : BIC=1008.385, Time=9.36 sec
 ARIMA(1,0,0)(2,1,0)[52]             : BIC=908.812, Time=10.96 sec
 ARIMA(2,0,1)(2,1,0)[52]             : BIC=910.030, Time=21.47 sec
 ARIMA(1,0,2)(2,1,0)[52]             : BIC=909.887, Time=16.97 sec
 ARIMA(0,0,0)(2,1,0)[52]             : BIC=1147.992, Time=5.28 sec
 ARIMA(0,0,2)(2,1,0)[52]             : BIC=962.768, Time=11.32 sec
 ARIMA(2,0,0)(2,1,0)[52]             : BIC=906.676, Time=12.15 sec
 ARIMA(2,0,2)(2,1,0)[52]             : BIC=909.278, Time=69.64 sec

Best model:  ARIMA(1,0,1)(2,1,0)[52]          
Total fit time: 1822.715 seconds
In [23]:
# train the SARIMA model with the optimized hyperparameters
smodel = SARIMAX(dtrain.log_y, order=a_order, seasonal_order=s_order, trend=c_trend).fit(disp=False)

print(smodel.summary())
smodel.plot_diagnostics(figsize=(16, 6))
plt.show();
                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                               log_y   No. Observations:                  376
Model:             SARIMAX(1, 0, 1)x(2, 1, [], 52)   Log Likelihood                -437.929
Date:                             Sun, 28 Apr 2024   AIC                            885.858
Time:                                     21:47:27   BIC                            904.762
Sample:                                          0   HQIC                           893.404
                                             - 376                                         
Covariance Type:                               opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.8435      0.043     19.725      0.000       0.760       0.927
ma.L1         -0.2539      0.072     -3.551      0.000      -0.394      -0.114
ar.S.L52      -0.6705      0.053    -12.762      0.000      -0.773      -0.567
ar.S.L104     -0.3333      0.051     -6.556      0.000      -0.433      -0.234
sigma2         0.8011      0.049     16.386      0.000       0.705       0.897
===================================================================================
Ljung-Box (L1) (Q):                   0.02   Jarque-Bera (JB):                42.23
Prob(Q):                              0.89   Prob(JB):                         0.00
Heteroskedasticity (H):               0.94   Skew:                            -0.06
Prob(H) (two-sided):                  0.74   Kurtosis:                         4.76
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
No description has been provided for this image
In [24]:
print(smodel.summary())
sns.set_theme(style='white')
smodel.plot_diagnostics(figsize=(16, 6))
sns.set_theme(style='white')
auto_partial_corr_plot(smodel.resid[53:])
                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                               log_y   No. Observations:                  376
Model:             SARIMAX(1, 0, 1)x(2, 1, [], 52)   Log Likelihood                -437.929
Date:                             Sun, 28 Apr 2024   AIC                            885.858
Time:                                     21:47:28   BIC                            904.762
Sample:                                          0   HQIC                           893.404
                                             - 376                                         
Covariance Type:                               opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.8435      0.043     19.725      0.000       0.760       0.927
ma.L1         -0.2539      0.072     -3.551      0.000      -0.394      -0.114
ar.S.L52      -0.6705      0.053    -12.762      0.000      -0.773      -0.567
ar.S.L104     -0.3333      0.051     -6.556      0.000      -0.433      -0.234
sigma2         0.8011      0.049     16.386      0.000       0.705       0.897
===================================================================================
Ljung-Box (L1) (Q):                   0.02   Jarque-Bera (JB):                42.23
Prob(Q):                              0.89   Prob(JB):                         0.00
Heteroskedasticity (H):               0.94   Skew:                            -0.06
Prob(H) (two-sided):                  0.74   Kurtosis:                         4.76
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
No description has been provided for this image
No description has been provided for this image
In [25]:
evaluate_resids(smodel.resid[53:]) 
      lb_stat  lb_pvalue
1    0.004402   0.947102
2    0.588790   0.744982
3    0.602896   0.895769
4    2.097493   0.717833
5    2.559669   0.767483
6    6.118773   0.410018
7    6.200937   0.516493
8    6.456425   0.596245
9    6.724942   0.665732
10  10.002373   0.440285
11  11.018811   0.441689
12  11.058256   0.523933
13  11.704027   0.552050
14  11.721177   0.628683
15  11.849812   0.690360
16  13.249801   0.654413
17  13.607124   0.694691
18  13.611429   0.754048
19  14.224320   0.770461
20  15.710156   0.734435
21  17.799681   0.661643
22  17.919874   0.710728
23  21.036780   0.578841
24  23.010670   0.519168
25  23.061654   0.573963
26  26.912567   0.413929
27  26.912877   0.468503
28  27.175336   0.508715
29  29.166714   0.456409
30  29.358696   0.498818
31  30.543110   0.489395
32  30.555540   0.539660
33  30.679373   0.583154
34  30.797388   0.625394
35  30.942149   0.664417
36  31.910280   0.663527
37  31.949842   0.704580
38  33.272091   0.687658
39  34.545796   0.673145
40  36.844982   0.613076
                ADF_statistic ADF_conf_int ADF_p_value KPSS_statistic  \
residual series       -17.822       -2.871         0.0          0.209   

                KPSS_p_value  
residual series          0.1  
No description has been provided for this image
No description has been provided for this image

Forecast with SARIMA model¶

In [26]:
# function to plot the forecasted data
def plot_fcst(mod):
    sfitted = mod.get_prediction(start=dtrain.index[0], end=dtrain.index[-1], dynamic=False).conf_int()
    sfitted = np.exp(sfitted)
    sfitted['log_yhat'] = mod.fittedvalues
    sfitted['yhat'] = np.exp(mod.fittedvalues)
    dfout = pd.concat([dtrain, sfitted], axis=1)
    dfout = dfout.rename(columns={'lower log_y':'yhat_lower', 'upper log_y':'yhat_upper'})
    dfout['train_test'] = 'train'

    sfcst = mod.get_forecast(test_size+10, dynamic=True).summary_frame()

    sfcst['date'] = [dtrain.date.max() + datetime.timedelta(days=7*(i+1)) for i in range(len(sfcst))]
    sfcst['yhat'] = np.exp(sfcst['mean'])
    sfcst['yhat_lower'] = np.exp(sfcst['mean_ci_lower'])
    sfcst['yhat_upper'] = np.exp(sfcst['mean_ci_upper'])
    sfcst = sfcst.rename(columns={"mean": "log_yhat"})

    dtestx = dtest.copy()
    dtestx = pd.merge_asof(left=dtestx, right=sfcst, on='date', direction='nearest')
    dtestx['train_test'] = 'test'
    dfout = pd.concat([dfout, dtestx], axis=0)
    
    #plt.figure(figsize=(10, 2))
    fig = plt.figure(figsize=(12, 2))
    ax = plt.subplot(111)
    ax.plot(dfout.date, dfout.CI_cyano, label='actual data')
    ax.plot(dfout.date[53:], dfout.yhat[53:], label='prediction')
    
    ax.fill_between(dfout.date[53:],
                     dfout.yhat_lower[53:],
                     dfout.yhat_upper[53:],
                     color = "orange", alpha=0.3,label='CI bound')
    
    if dfout.CI_cyano.max() >=hab_med or dfout.yhat[53:].max() >=hab_med:
        ax.hlines(y=hab_med, xmin=dfout.date.min(), xmax=dfout.date.max(), 
                   color='orange', linestyles='dotted', label='HAB Medium')
        
    if dfout.CI_cyano.max() >=hab_high or dfout.yhat[53:].max() >=hab_high:
        ax.hlines(y=hab_high, xmin=dfout.date.min(), xmax=dfout.date.max(), 
                   color='red', linestyles='dotted', label='HAB High')
        
    ax.axvline(x=dtest.date.min(),linestyle=':',color='k')
    ax.set_ylabel('CI_cyano', fontsize=12)
            
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    
    plt.title(model_name, fontsize=15)
    plt.grid(linewidth=0.3)
    plt.ylim(0, dfout.yhat[53:].max()*1.1);
    #plt.legend();
    
    return dfout
In [27]:
sns.set_theme(style='white')
# plot train and test fit
fcdat_out = plot_fcst(smodel)

# plot test results
plt.figure(figsize=(15, 3))
plt.plot(dtest.date, fcdat_out[fcdat_out.train_test=='test'].CI_cyano, label='actual')
plt.plot(dtest.date, fcdat_out[fcdat_out.train_test=='test'].yhat, label='forecast')

# plt.fill_between(dtest.date, fcdat_out[fcdat_out.train_test=='test'].yhat_lower,
#                      fcdat_out[fcdat_out.train_test=='test'].yhat_upper,
#                      color = "orange", alpha=0.3,label='forecast bound')

if dtest.CI_cyano.max() >=hab_high or fcdat_out[fcdat_out.train_test=='test'].yhat.max() >=hab_high:
    plt.hlines(y=hab_high, xmin=dtest.date.min(), xmax=dtest.date.max(), 
                color='red', linestyles='dotted', label='HAB High')
if dtest.CI_cyano.max() >=hab_med or fcdat_out[fcdat_out.train_test=='test'].yhat.max() >=hab_med:
    plt.hlines(y=hab_med, xmin=dtest.date.min(), xmax=dtest.date.max(), 
               color='orange', linestyles='dotted', label='HAB Medium')
plt.title('Prediction on Test dataset', fontsize=15)
plt.legend()
plt.grid(linewidth=0.3);
/opt/anaconda3/envs/capsproj/lib/python3.10/site-packages/pandas/core/internals/blocks.py:366: RuntimeWarning: overflow encountered in exp
  result = func(self.values, **kwargs)
No description has been provided for this image
No description has been provided for this image

Prediction Accuracy¶

In [28]:
# function to calculation prediction accuracy
def acc_metric(ytrue, ypred, hab_high=0.016, hab_med=0.001):
    
    ytrue_hab = np.where(ytrue >= hab_high, 'high', np.where(ytrue >= hab_med, 'med', 'low'))
    ypred_hab = np.where(ypred >= hab_high, 'high', np.where(ypred >= hab_med, 'med', 'low'))
        
    acc = np.mean(ytrue_hab==ypred_hab)
    level_perf = {}
    level_cnt, sum_pre, sum_rec = 0, 0, 0
    for l in ['low','med','high']:
        acc_l, pre_l, rec_l = None, None, None
        if np.sum(ytrue_hab==l)>0:
            level_cnt += 1
            acc_l = np.sum((ypred_hab==l)*(ytrue_hab==l))/np.sum(ytrue_hab==l)
            rec_l = np.sum((ypred_hab==l)*(ytrue_hab==l))/np.sum(ytrue_hab==l)
        if np.sum(ypred_hab==l)>0:
            pre_l = np.sum((ypred_hab==l)*(ytrue_hab==l))/np.sum(ypred_hab==l)            
        level_perf[l] = [acc_l, pre_l, rec_l]
        if pre_l: sum_pre += pre_l
        if rec_l: sum_rec += rec_l
    avg_pre = sum_pre/level_cnt
    avg_rec = sum_rec/level_cnt
    
    df = pd.DataFrame({'act':ytrue_hab, 'fcst':ypred_hab})
    df['obs']=1
    df_out = df.pivot_table(index='act',columns='fcst',values='obs',aggfunc='sum', margins=True)
        
    return {'Accuracy':acc, 
            'Acc-Low':level_perf['low'][0],
            'Acc-Med':level_perf['med'][0],
            'Acc-High':level_perf['high'][0],
            'Avg Precision': avg_pre,
            'Precision-Low':level_perf['low'][1],
            'Precision-Med':level_perf['med'][1],
            'Precision-High':level_perf['high'][1],
            'Avg Recall': avg_rec,
            'Recall-Low':level_perf['low'][2],
            'Recall-Med':level_perf['med'][2],
            'Recall-High':level_perf['high'][2]}, df_out

def sarima_accuracy(df, offset=53):
    df_train = df[df.train_test=='train'][offset:]
    df_test = df[df.train_test=='test']
    tr, mattr = acc_metric(df_train.CI_cyano.values, df_train.yhat.values)
    te, matte = acc_metric(df_test.CI_cyano.values, df_test.yhat.values)

    print('Accuracy Performance:')
    pd.options.display.float_format = '{:.1%}'.format
    pout = pd.DataFrame({'train':tr, 'test':te})
    display(pout[:4])
    display(pout[4:8])
    display(pout[8:])
    print('\nConfusion Matrix - Train:')
    display(mattr)
    print('\nConfusion Matrix - Test:')
    display(matte)
    return pout, mattr, matte
In [29]:
# assess model performance on accuracy, precision and recall:
perf_out, _, _ = sarima_accuracy(fcdat_out)
Accuracy Performance:
train test
Accuracy 89.2% 95.1%
Acc-Low 93.2% 91.7%
Acc-Med 84.2% 100.0%
Acc-High NaN NaN
train test
Avg Precision 89.7% 94.7%
Precision-Low 88.2% 100.0%
Precision-Med 91.1% 89.5%
Precision-High 0.0% NaN
train test
Avg Recall 88.7% 95.8%
Recall-Low 93.2% 91.7%
Recall-Med 84.2% 100.0%
Recall-High NaN NaN
Confusion Matrix - Train:
fcst high low med All
act
low NaN 16500.0% 1200.0% 177
med 100.0% 2200.0% 12300.0% 146
All 100.0% 18700.0% 13500.0% 323
Confusion Matrix - Test:
fcst low med All
act
low 2200.0% 200.0% 24
med NaN 1700.0% 17
All 2200.0% 1900.0% 41

Model "Rolling" Forecast¶

In [30]:
# forecast results for p weeks
def rolling_forecast(p, a_order, s_order, c_trend):
    fcst, fcst_low, fcst_high = [], [], []
    for t in range(test_size):
        dtrainx = fcdat[0+t:len(dtrain.index)+t]
        # re-train model
        newmod = SARIMAX(dtrainx.log_y, order=a_order, seasonal_order=s_order, trend=c_trend)
        newmod_fit = newmod.fit(disp=False)

        fcstx_results = newmod_fit.get_forecast(steps=p, dynamic=True)
        fcstx = fcstx_results.predicted_mean
        fcst_confintx = fcstx_results.conf_int()

        fcst.append(fcstx.values)
        fcst_low.append(fcst_confintx['lower log_y'].values)
        fcst_high.append(fcst_confintx['upper log_y'].values)
    
    return np.array(fcst), np.array(fcst_low), np.array(fcst_high)
In [31]:
# prediction accuracy on 'rolling' forecast
dfcst, dlow, dhigh = rolling_forecast(p=10, a_order=a_order, s_order=s_order, c_trend=c_trend)

plt.figure(figsize=(15, 4))
plt.plot(dtest.date, dtest.CI_cyano, label='actual')
plt.plot(dtest.date, np.exp(dfcst[:,0]), label='forecast')

if dtest.CI_cyano.max() >=hab_high or np.exp(dfcst[:,0]).max()>=hab_high:
    plt.hlines(y=hab_high, xmin=dtest.date.min(), xmax=dtest.date.max(), 
                color='red', linestyles='dotted', label='HAB High')
if dtest.CI_cyano.max() >=hab_med or np.exp(dfcst[:,0]).max()>=hab_med:
    plt.hlines(y=hab_med, xmin=dtest.date.min(), xmax=dtest.date.max(), 
               color='orange', linestyles='dotted', label='HAB Medium')
    
plt.grid(linewidth=0.25)
plt.legend();

te, matte = acc_metric(dtest.CI_cyano.values, np.exp(dfcst[:,0]))

print('Accuracy Performance:')

display(pd.DataFrame(te, index=['Rolling forecast']))

print('\nConfusion Matrix - Test:')
display(matte)
Accuracy Performance:
Accuracy Acc-Low Acc-Med Acc-High Avg Precision Precision-Low Precision-Med Precision-High Avg Recall Recall-Low Recall-Med Recall-High
Rolling forecast 95.1% 95.8% 94.1% None 95.0% 95.8% 94.1% None 95.0% 95.8% 94.1% None
Confusion Matrix - Test:
fcst low med All
act
low 23 1 24
med 1 16 17
All 24 17 41
No description has been provided for this image
In [32]:
# forcast window
p = 10
dtest_rolling = pd.DataFrame()
for t in range(test_size):
    tmpdf = fcdat[len(dtrain.index)+t:len(dtrain.index)+t+p]
    tmpdf = tmpdf.CI_cyano.reset_index(drop=True)
    missing = dtest_rolling.shape[0]-tmpdf.shape[0]
    if missing>0:
        tmpdf = pd.concat([tmpdf, pd.DataFrame([0]*missing)]).reset_index(drop=True)
                                         
    dtest_rolling = pd.concat([dtest_rolling, tmpdf], axis=1)
    
dtest_rolling = dtest_rolling.T
dtest_rolling.columns=['y_w'+str(i+1) for i in range(p)]
dtest_rolling.index = dtest.index


rolling_acc = pd.DataFrame()
for i in range(p):
    col = 'y_w'+str(i+1)
    n_fcst = (dtest_rolling['y_w'+str(i+1)]>0).sum()
    a, _ = acc_metric(dtest_rolling[dtest_rolling[col]>0][col].values, np.exp(dfcst[:n_fcst,i]))
    tmpdf = pd.DataFrame(a, index=["Week "+str(i+1)])
    rolling_acc = pd.concat([rolling_acc, tmpdf], axis=0)

print(f'Avg accuracy = {rolling_acc.Accuracy.mean():.2%}')
print(f'Avg accuracy (low) = {rolling_acc["Acc-Low"].mean():.2%}')
print(f'Avg accuracy (med) = {rolling_acc["Acc-Med"].mean():.2%}')
print(f'Avg accuracy (high) = {rolling_acc["Acc-High"].mean():.2%}')
rolling_acc
Avg accuracy = 94.74%
Avg accuracy (low) = 92.50%
Avg accuracy (med) = 99.41%
Avg accuracy (high) = nan%
Out[32]:
Accuracy Acc-Low Acc-Med Acc-High Avg Precision Precision-Low Precision-Med Precision-High Avg Recall Recall-Low Recall-Med Recall-High
Week 1 95.1% 95.8% 94.1% None 95.0% 95.8% 94.1% None 95.0% 95.8% 94.1% None
Week 2 97.5% 95.8% 100.0% None 97.1% 100.0% 94.1% None 97.9% 95.8% 100.0% None
Week 3 94.9% 91.7% 100.0% None 94.1% 100.0% 88.2% None 95.8% 91.7% 100.0% None
Week 4 94.7% 91.7% 100.0% None 93.8% 100.0% 87.5% None 95.8% 91.7% 100.0% None
Week 5 94.6% 91.7% 100.0% None 93.3% 100.0% 86.7% None 95.8% 91.7% 100.0% None
Week 6 94.4% 91.7% 100.0% None 92.9% 100.0% 85.7% None 95.8% 91.7% 100.0% None
Week 7 94.3% 91.7% 100.0% None 92.3% 100.0% 84.6% None 95.8% 91.7% 100.0% None
Week 8 94.1% 91.7% 100.0% None 91.7% 100.0% 83.3% None 95.8% 91.7% 100.0% None
Week 9 93.9% 91.7% 100.0% None 90.9% 100.0% 81.8% None 95.8% 91.7% 100.0% None
Week 10 93.8% 91.7% 100.0% None 90.0% 100.0% 80.0% None 95.8% 91.7% 100.0% None
In [33]:
rolling_acc.Accuracy.plot(title="Forecast Accuracy by week", figsize=(15, 4))
rolling_acc['Acc-Low'].plot()
rolling_acc['Acc-Med'].plot()
plt.legend();
No description has been provided for this image

Next 3 months Forecast¶

In [34]:
# train the SARIMA model with the optimized hyperparameters
model = SARIMAX(fcdat.log_y, order=a_order, seasonal_order=s_order, trend=c_trend).fit(disp=False)

# predict values for next 3 months
pred3m = model.get_forecast(12, dynamic=True).summary_frame()

pred3m['date'] = [fcdat.date.max() + datetime.timedelta(days=7*(i+1)) for i in range(len(pred3m))]
pred3m['Predicted CI_cyano'] = np.exp(pred3m['mean'])
pred3m['yhat_lower'] = np.exp(pred3m['mean_ci_lower'])
pred3m['yhat_upper'] = np.exp(pred3m['mean_ci_upper'])
pred3m = pred3m.rename(columns={"mean": "log_yhat"})
pred3m['HAB Level'] = np.where(pred3m['Predicted CI_cyano'].values >= hab_high,'High',
                         np.where(pred3m['Predicted CI_cyano'].values >= hab_med, 'Medium', 'Low'))
In [35]:
pd.options.display.float_format = '{:.6f}'.format
pred3m[['date','Predicted CI_cyano','HAB Level']]
Out[35]:
log_y date Predicted CI_cyano HAB Level
417 2024-04-29 0.000666 Low
418 2024-05-06 0.000323 Low
419 2024-05-13 0.000372 Low
420 2024-05-20 0.002785 Medium
421 2024-05-27 0.001241 Medium
422 2024-06-03 0.002684 Medium
423 2024-06-10 0.005342 Medium
424 2024-06-17 0.003239 Medium
425 2024-06-24 0.004473 Medium
426 2024-07-01 0.005131 Medium
427 2024-07-08 0.003218 Medium
428 2024-07-15 0.003756 Medium
In [37]:
# plot test results
plt.figure(figsize=(18, 3))
ax = plt.subplot(111)
ax.plot(fcdat.date[-53:], fcdat.CI_cyano[-53:], label='Historical CI_cyano')
ax.plot(pred3m.date, pred3m['Predicted CI_cyano'], label='Predicted CI_cyano')

# ax.fill_between(pred3m.date, pred3m.yhat_lower,pred3m.yhat_upper,
#                 color = "orange", alpha=0.3,label='CI bound')

ax.hlines(y=hab_med, xmin=fcdat.date[-53:].min(), xmax=pred3m.date.max(), 
                color='orange', linestyles='dotted', label='HAB Medium')
ax.set_title('Historical CI cyano and next 3 months predictions', fontsize=15)

box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    
plt.grid(linewidth=0.3);
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]: